1. Data description

We consider data about the number of suicides in 33 boroughs of London (2012-2014) (datasourcelink). The file LondonSuicides.csv contains data about the following variables: the number of suicides (y), the number of expected cases of suicides (E) and the value of the social deprivation index (x1).

We first import and explore the data.

# Working directory containing the data files (machine-specific path)
mydir = "/Users/michelacameletti/Dropbox/INLA - course/Glasgow 2019/Tutorial/"
#mydir = "/Users/marta/Dropbox/Teaching/INLA/Glasgow 2019/Tutorial/"
setwd(mydir)
# Import the semicolon-separated data file
# NOTE(review): prefer header=TRUE over header=T (T is reassignable)
LondonSuicides = read.csv("LondonSuicides.csv",sep=";",header=T)

# Standardised Mortality Ratio: observed over expected counts
LondonSuicides$SMR = LondonSuicides$y/LondonSuicides$E

# Explore the data: dimensions, first rows, variable types and
# summary statistics of the observed counts y
dim(LondonSuicides)
## [1] 33  6
head(LondonSuicides)
##    GSS_CODE                 NAME  y    E    x1       SMR
## 1 E09000001       City of London  4  1.5 12.84 2.6666667
## 2 E09000002 Barking and Dagenham 34 37.5 34.49 0.9066667
## 3 E09000003               Barnet 68 71.6 21.16 0.9497207
## 4 E09000004               Bexley 57 46.6 16.21 1.2231760
## 5 E09000005                Brent 64 62.5 29.22 1.0240000
## 6 E09000006              Bromley 72 62.2 14.36 1.1575563
str(LondonSuicides)
## 'data.frame':    33 obs. of  6 variables:
##  $ GSS_CODE: Factor w/ 33 levels "E09000001","E09000002",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ NAME    : Factor w/ 33 levels "Barking and Dagenham",..: 7 1 2 3 4 5 6 8 9 10 ...
##  $ y       : int  4 34 68 57 64 72 53 83 48 50 ...
##  $ E       : num  1.5 37.5 71.6 46.6 62.5 62.2 44.1 73.1 68 62.9 ...
##  $ x1      : num  12.8 34.5 21.2 16.2 29.2 ...
##  $ SMR     : num  2.667 0.907 0.95 1.223 1.024 ...
# Variance (275) much larger than the mean (49.8): overdispersion
range(LondonSuicides$y)
## [1]  4 83
mean(LondonSuicides$y)
## [1] 49.78788
var(LondonSuicides$y)
## [1] 275.1723

We also have a shapefile (London_Borough.shp) which contains the geographical information about the 33 boroughs of London. We import the shapefile using the readOGR function and then explore the data table.

# Load rgdal (brings in sp) and read the borough boundaries shapefile
library(rgdal)
## Loading required package: sp
## rgdal: version: 1.4-3, (SVN revision 828)
##  Geospatial Data Abstraction Library extensions to R successfully loaded
##  Loaded GDAL runtime: GDAL 2.1.3, released 2017/20/01
##  Path to GDAL shared files: /Library/Frameworks/R.framework/Versions/3.6/Resources/library/rgdal/gdal
##  GDAL binary built with GEOS: FALSE 
##  Loaded PROJ.4 runtime: Rel. 4.9.3, 15 August 2016, [PJ_VERSION: 493]
##  Path to PROJ.4 shared files: /Library/Frameworks/R.framework/Versions/3.6/Resources/library/rgdal/proj
##  Linking to sp version: 1.3-1
london.shp = readOGR(paste0(mydir,"London_Borough.shp"))
## OGR data source with driver: ESRI Shapefile 
## Source: "/Users/michelacameletti/Dropbox/INLA - course/Glasgow 2019/Tutorial/London_Borough.shp", layer: "London_Borough"
## with 33 features
## It has 7 fields
# The attribute table lives in the @data slot of the SpatialPolygonsDataFrame
head(london.shp@data)
##                   NAME  GSS_CODE  HECTARES NONLD_AREA ONS_INNER SUB_2009 SUB_2006
## 0 Kingston upon Thames E09000021  3726.117      0.000         F     <NA>     <NA>
## 1              Croydon E09000008  8649.441      0.000         F     <NA>     <NA>
## 2              Bromley E09000006 15013.487      0.000         F     <NA>     <NA>
## 3             Hounslow E09000018  5658.541     60.755         F     <NA>     <NA>
## 4               Ealing E09000009  5554.428      0.000         F     <NA>     <NA>
## 5             Havering E09000016 11445.735    210.763         F     <NA>     <NA>
dim(london.shp@data)
## [1] 33  7
plot(london.shp)

It is possible to plot the name of each borough (NAME) by placing it at the centroid of each area, whose coordinates can be extracted with the coordinates function.

# Area centroids: one (x, y) pair per borough
head(coordinates(london.shp))
##       [,1]     [,2]
## 0 519297.6 166820.0
## 1 533290.2 163541.2
## 2 542895.5 165655.5
## 3 513515.5 175643.2
## 4 515887.9 181715.5
## 5 554049.0 187392.0
# Draw the map and label each borough at its centroid
plot(london.shp)
text(coordinates(london.shp)[,1],
     coordinates(london.shp)[,2],
     london.shp@data$NAME, cex=0.7)

Once a variable is available in the shapefile data table it is very easy to plot it. See for example the following code for obtaining the map of the HECTARES variable by using the spplot function.

# Choropleth map of a column in the shapefile data table
library(sp)
spplot(london.shp,"HECTARES", col.regions = heat.colors(50))

1.1 Adjacency matrix creation

For implementing the spatial model we will later need the adjacency matrix (see pages 40-43 of the lecture slides). The function poly2nb creates a binary adjacency matrix, so that two regions are neighbours if and only if they share at least one boundary point (i.e., queen adjacency).

# Build the neighbour list: queen contiguity = areas sharing at least one
# boundary point are neighbours
library(spdep)
## Loading required package: spData
## To access larger datasets in this package, install the spDataLarge package with:
## `install.packages('spDataLarge', repos='https://nowosad.github.io/drat/', type='source')`
## Loading required package: sf
## Linking to GEOS 3.6.1, GDAL 2.1.3, PROJ 4.9.3
london.adj = poly2nb(london.shp,queen = T)
summary(london.adj)
## Neighbour list object:
## Number of regions: 33 
## Number of nonzero links: 136 
## Percentage nonzero weights: 12.48852 
## Average number of links: 4.121212 
## Link number distribution:
## 
##  2  3  4  5  6  7 
##  2 10  9  7  4  1 
## 2 least connected regions:
## 5 14 with 2 links
## 1 most connected region:
## 8 with 7 links

The following code can be used to plot the neighbourhood structure.

# Overlay the neighbourhood links (blue) and centroids (red) on the map
plot(london.shp)
plot(london.adj, coordinates(london.shp),
     pch = ".", add = TRUE, col=4)
points(coordinates(london.shp)[,1],coordinates(london.shp)[,2],bg="red", pch=21, cex=0.75)

It can be seen that due to the river some areas are not connected. To avoid this artefact we use the snap argument (boundary points less than the snap distance apart are considered to indicate contiguity; see link).

# Rebuild the neighbour list with snap = 700: boundary points closer than
# 700 map units (presumably metres, British National Grid — TODO confirm CRS)
# count as touching, so boroughs separated by the Thames become neighbours
london.adj = poly2nb(london.shp,snap=700, queen = T)
summary(london.adj)
## Neighbour list object:
## Number of regions: 33 
## Number of nonzero links: 176 
## Percentage nonzero weights: 16.16162 
## Average number of links: 5.333333 
## Link number distribution:
## 
## 3 4 5 6 7 8 
## 4 6 7 9 5 2 
## 4 least connected regions:
## 5 6 15 18 with 3 links
## 2 most connected regions:
## 10 24 with 8 links
plot(london.shp)
plot(london.adj, coordinates(london.shp),
     pch = ".", add = TRUE, col=4)
points(coordinates(london.shp)[,1],coordinates(london.shp)[,2],bg="red", pch=21, cex=0.75)

The function nb2INLA creates a file in the specified directory with the representation of the adjacency matrix as required by INLA for its spatial models.

# Export the adjacency structure in the graph format INLA expects,
# and keep its path for later use in f(..., graph=)
nb2INLA(file=paste0(mydir,"london.graph"),
        nb=london.adj)
dir() #see the new file london.graph
##  [1] "Data.xlsx"             "LondonSuicides.csv"    "LondonSuicides_.csv"   "London_Borough.dbf"   
##  [5] "London_Borough.shp"    "London_Borough.shx"    "SHP"                   "Tutorial_R_INLA.Rmd"  
##  [9] "Tutorial_R_INLA.html"  "Tutorial_R_INLA_.html" "Tutorial_R_INLA_files" "london.graph"
london.adj.path = paste0(mydir,"london.graph")

2 Poisson-logNormal model (with covariate only)

We implement the following Poisson regression model for the observed number of suicides in the i-th area (\(y_i\)): \[ y_i\sim \mbox{Poisson}(\phi_i E_i) \;\;\; i=1,...,33 \] where \(\phi_i\) is the relative risk and \(E_i\) the expected number of cases.

The linear predictor is defined on the logarithm scale and is given by \[ \eta_i=\log(\phi_i)=\beta_0 + \beta_1 x_{1i} \] where \(\beta_0\) is the intercept, \(x_{1i}\) is the value of the covariate \(x_1\) for the i-th area, \(\beta_1\) is the covariate coefficient.

By default the intercept \(\beta_0\) has a Gaussian prior with mean and precision equal to zero. The covariate coefficient \(\beta_1\) has a Gaussian prior by default with zero mean and precision equal to 0.001.

library(INLA)
## Warning: package 'INLA' was built under R version 3.6.1
## Loading required package: Matrix
## Loading required package: parallel
## This is INLA_19.07.21 built 2019-07-21 14:38:44 UTC.
## See www.r-inla.org/contact-us for how to get help.
## To enable PARDISO sparse library; see inla.pardiso()
# Fixed effects only: intercept + deprivation covariate
formula = y ~ 1 + x1 
# E = E supplies the expected counts (multiplicative offset of the Poisson mean)
output0 = inla(formula,
              family = "poisson",
              data = LondonSuicides,
              E = E)

summary(output0)
## 
## Call:
##    c("inla(formula = formula, family = \"poisson\", data = LondonSuicides, ", " E = E)") 
## Time used:
##     Pre = 1.48, Running = 0.23, Post = 0.293, Total = 2.01 
## Fixed effects:
##               mean    sd 0.025quant 0.5quant 0.975quant   mode kld
## (Intercept) -0.189 0.074     -0.336   -0.189     -0.045 -0.189   0
## x1           0.007 0.003      0.002    0.007      0.012  0.007   0
## 
## Expected number of effective parameters(stdev): 2.01(0.00)
## Number of equivalent replicates : 16.42 
## 
## Marginal log-Likelihood:  -138.26

Note that R-INLA provides an estimate of the effective number of parameters (a measure of model complexity) and of the number of equivalent replicates (a measure of the number of independent observations in the data).

To account for overdispersion in the data we include an i.i.d. Gaussian random effect in the model.

3 Poisson-logNormal model (with covariate + iid random effect)

We implement the following Poisson regression model (see page 26 of lecture slides) for the observed number of suicides in the i-th area (\(y_i\)): \[ y_i\sim \mbox{Poisson}(\phi_i E_i) \;\;\; i=1,...,33 \] where \(\phi_i\) is the relative risk and \(E_i\) the expected number of cases.

The linear predictor is defined on the logarithm scale and is given by \[ \eta_i=\log(\phi_i)=\beta_0 + \beta_1 x_{1i} + v_{i}, \] where \(\beta_0\) is the intercept, \(x_{1i}\) is the value of the covariate \(x_1\) for the i-th area, \(\beta_1\) is the covariate coefficient. \(v_i\) represents the random effect which is assumed to be normally distributed with mean equal to zero and precision \(\tau_v\). The precision \(\tau_{v}\) is a hyperparameter with default prior given by \(\log\tau_{v}\sim\mbox{logGamma}(1, 0.00005)\) (see inla.models()$latent$iid$hyper).

3.1 Run INLA

The model we want to estimate contains two fixed effects (the coefficients \(\beta_0\) and \(\beta_1\)) and one random effect (\(v_i\)), which will be specified in R-INLA using the f(...) function. To deal with the random effect INLA needs an index variable (id) to map the effect to the areas.

# Index variable (1, ..., 33) mapping each random effect to its area.
# seq_len(nrow(...)) is the safe idiom: the original seq(1:nrow(...))
# only worked because seq() returns a vector argument of length > 1
# unchanged, and it misbehaves for zero-row data.
LondonSuicides$id = seq_len(nrow(LondonSuicides))
head(LondonSuicides)
##    GSS_CODE                 NAME  y    E    x1       SMR id
## 1 E09000001       City of London  4  1.5 12.84 2.6666667  1
## 2 E09000002 Barking and Dagenham 34 37.5 34.49 0.9066667  2
## 3 E09000003               Barnet 68 71.6 21.16 0.9497207  3
## 4 E09000004               Bexley 57 46.6 16.21 1.2231760  4
## 5 E09000005                Brent 64 62.5 29.22 1.0240000  5
## 6 E09000006              Bromley 72 62.2 14.36 1.1575563  6

To run the model in INLA we have to specify first of all the formula

formula = y ~ 1 + x1 + f(id,model="iid") 

Then the inla function is run to get the output:

# Fit the Poisson model with the iid random effect (default priors)
output1 = inla(formula,
              family = "poisson",
              data = LondonSuicides,
              E = E)

summary(output1)
## 
## Call:
##    c("inla(formula = formula, family = \"poisson\", data = LondonSuicides, ", " E = E)") 
## Time used:
##     Pre = 1.7, Running = 0.164, Post = 0.0982, Total = 1.96 
## Fixed effects:
##               mean    sd 0.025quant 0.5quant 0.975quant   mode kld
## (Intercept) -0.193 0.084     -0.362   -0.193     -0.031 -0.191   0
## x1           0.007 0.003      0.001    0.007      0.013  0.007   0
## 
## Random effects:
##   Name     Model
##     id IID model
## 
## Model hyperparameters:
##                      mean       sd 0.025quant 0.5quant 0.975quant  mode
## Precision for id 11386.28 17743.39      32.57  3083.80   61841.38 47.77
## 
## Expected number of effective parameters(stdev): 6.54(6.03)
## Number of equivalent replicates : 5.04 
## 
## Marginal log-Likelihood:  -138.75
# All components of the returned inla object
names(output1)
##  [1] "names.fixed"                 "summary.fixed"               "marginals.fixed"            
##  [4] "summary.lincomb"             "marginals.lincomb"           "size.lincomb"               
##  [7] "summary.lincomb.derived"     "marginals.lincomb.derived"   "size.lincomb.derived"       
## [10] "mlik"                        "cpo"                         "po"                         
## [13] "waic"                        "model.random"                "summary.random"             
## [16] "marginals.random"            "size.random"                 "summary.linear.predictor"   
## [19] "marginals.linear.predictor"  "summary.fitted.values"       "marginals.fitted.values"    
## [22] "size.linear.predictor"       "summary.hyperpar"            "marginals.hyperpar"         
## [25] "internal.summary.hyperpar"   "internal.marginals.hyperpar" "offset.linear.predictor"    
## [28] "model.spde2.blc"             "summary.spde2.blc"           "marginals.spde2.blc"        
## [31] "size.spde2.blc"              "model.spde3.blc"             "summary.spde3.blc"          
## [34] "marginals.spde3.blc"         "size.spde3.blc"              "logfile"                    
## [37] "misc"                        "dic"                         "mode"                       
## [40] "neffp"                       "joint.hyper"                 "nhyper"                     
## [43] "version"                     "Q"                           "graph"                      
## [46] "ok"                          "cpu.used"                    "all.hyper"                  
## [49] ".args"                       "call"                        "model.matrix"

Note the large posterior mean of the precision of the Gaussian random effect. This could indicate that the covariate explains the overdispersion in the data well.

3.2 Exploring the output: fixed effects and hyperparameters

The posterior summary statistics of the fixed effects are contained in the summary.fixed object:

# Summary statistics of the fixed effects (one row per coefficient)
output1$summary.fixed
##                    mean          sd 0.025quant     0.5quant  0.975quant         mode          kld
## (Intercept) -0.19346220 0.083730225 -0.3616186 -0.192546287 -0.03091707 -0.191038464 1.631283e-06
## x1           0.00719342 0.002933758  0.0014391  0.007180642  0.01302403  0.007163436 1.953868e-06
  • For each fixed effect R-INLA reports a set of summary statistics from the posterior distribution.
  • The value of the Kullback-Leibler divergence (kld) describes the difference between the Gaussian approximation and the Simplified Laplace Approximation (SLA) to the marginal posterior densities: small values indicate that the posterior distribution is well approximated by a Normal distribution.

The same information can be retrieved for the hyperparameters:

# Summary statistics of the hyperparameter (reported on the precision scale)
output1$summary.hyperpar
##                      mean       sd 0.025quant 0.5quant 0.975quant     mode
## Precision for id 11386.28 17743.39   32.57167 3083.801   61841.38 47.76675
  • For each hyperparameter the summary statistics are reported to describe the posterior distribution.
  • NB: R-INLA reports results on the precision scale (more on this later).

3.3 Manipulating the marginals: fixed effects

It is also possible to extract the posterior marginal distribution of each fixed effect parameter stored in the marginals.fixed object.

# marginals.fixed is a list with one marginal density per fixed effect
class(output1$marginals.fixed)
## [1] "list"
names(output1$marginals.fixed)
## [1] "(Intercept)" "x1"

Let us assume for example that we are interested in extracting and plotting the marginal posterior distribution of \(\beta_1\):

# Extract the marginal of beta1 (from a list so we use [[...]])
beta1_post = output1$marginals.fixed[["x1"]]  
# Each marginal is a 2-column matrix: support points x and density values y
class(beta1_post)
## [1] "matrix"
head(beta1_post)
##                 x            y
## [1,] -0.022181656 8.834798e-11
## [2,] -0.016307277 1.284916e-06
## [3,] -0.012391717 1.086740e-04
## [4,] -0.010432898 7.997325e-04
## [5,] -0.010200780 1.004317e-03
## [6,] -0.007839525 9.278027e-03
# Another function for getting summary statistics
inla.zmarginal(beta1_post) 
## Mean            0.00719343 
## Stdev           0.00293366 
## Quantile  0.025 0.00143466 
## Quantile  0.25  0.00524997 
## Quantile  0.5   0.00716636 
## Quantile  0.75  0.00909148 
## Quantile  0.975 0.0130069
# Plot the posterior marginal distribution with posterior mean
plot(beta1_post, type="l", xlab="beta1",ylab="") 
abline(v=output1$summary.fixed[2,"mean"])

It is possible to extract more information from the posterior distribution of \(\beta_1\):

# The inla.*marginal family mirrors R's q/p/d/r distribution functions
# Quantile function
inla.qmarginal(0.05,beta1_post) 
## [1] 0.002409872
# Distribution function
inla.pmarginal(0.01,beta1_post)
## [1] 0.8381629
# Density function
inla.dmarginal(0.01,beta1_post) 
## [1] 83.94023
# Generate values from the distribution
inla.rmarginal(4,beta1_post) 
## [1] 0.008196722 0.006649669 0.006358957 0.008391472

To estimate the effect of the covariates on the risk of suicide we compute the posterior mean of the parameter on the natural scale (\(\exp(\beta_1)\)). This can be obtained by means of the inla.emarginal function:

# Posterior mean of exp(beta1): covariate effect on the natural (risk) scale
inla.emarginal(fun=exp,marginal=output1$marginals.fixed$x1)
## [1] 1.007224
# 95% credible interval
inla.qmarginal(c(0.025,0.975),
              inla.tmarginal(fun=exp,mar=output1$marginals.fixed$x1)) 
## [1] 1.001469 1.013074

We obtain that an increase of 1 unit in the x1 covariate is associated with an increase of around 0.7% in the risk of suicide.

3.4 Manipulating the marginal: hyperparameter

The posterior marginal distribution of the hyperparameters (in this case just the precision of the iid random effect) is stored in the marginals.hyperpar object (which is a list):

# marginals.hyperpar holds one marginal per hyperparameter (here just one)
class(output1$marginals.hyperpar)
## [1] "list"
names(output1$marginals.hyperpar)
## [1] "Precision for id"

It is possible for example to plot the precision marginal posterior distribution

# Posterior marginal of the iid-effect precision
prec.post = output1$marginals.hyperpar[[1]]
plot(prec.post,t="l")

3.5 Change the hyperparameter prior

Instead of the default \(\mbox{logGamma}(1, 0.00005)\) prior for \(\log\tau_{v}\), we want for example to specify a \(\mbox{logGamma}(1, 0.1)\) prior. To change the prior we use the hyper argument in the f function (see inla.models()$latent$iid$hyper).

In the following code we use also the option control.compute to compute the DIC value.

# Same iid model but with a logGamma(1, 0.1) prior on the log-precision;
# control.compute=list(dic=TRUE) asks INLA to return the DIC as well
formula = y ~ 1 + x1 + 
  f(id,model="iid",
    hyper=list(prec=list(prior="loggamma",param=c(1,0.1))))

output2 = inla(formula,
                family = "poisson",
                data = LondonSuicides,
                E = E,
                control.compute=list(dic=TRUE)) 
summary(output2)
## 
## Call:
##    c("inla(formula = formula, family = \"poisson\", data = LondonSuicides, ", " E = E, 
##    control.compute = list(dic = TRUE))") 
## Time used:
##     Pre = 1.56, Running = 0.174, Post = 0.1, Total = 1.84 
## Fixed effects:
##               mean    sd 0.025quant 0.5quant 0.975quant   mode kld
## (Intercept) -0.206 0.122     -0.448   -0.206      0.034 -0.205   0
## x1           0.007 0.004     -0.001    0.007      0.016  0.007   0
## 
## Random effects:
##   Name     Model
##     id IID model
## 
## Model hyperparameters:
##                   mean    sd 0.025quant 0.5quant 0.975quant  mode
## Precision for id 31.86 11.51      14.80    30.05      59.38 26.68
## 
## Expected number of effective parameters(stdev): 20.61(2.49)
## Number of equivalent replicates : 1.60 
## 
## Deviance Information Criterion (DIC) ...............: 241.06
## Deviance Information Criterion (DIC, saturated) ....: 54.34
## Effective number of parameters .....................: 20.96
## 
## Marginal log-Likelihood:  -135.77 
## Posterior marginals for the linear predictor and
##  the fitted values are computed
prec.post2 = output2$marginals.hyperpar[[1]]

# Compare the precision posterior under the two priors
plot(prec.post2,type="l")
lines(prec.post,col=2)
legend("topright",col=1:2,lty=c(1,1),legend=c("New prior","Default prior"),box.lty=0)

But usually we want to make inference on more interpretable parameters, e.g. the variance or standard deviations. The function inla.tmarginal transforms a marginal distribution by using the desired transformation defined through function(...){}:

# Define the marginal variance (transform precision -> variance via 1/x)
var.post = inla.tmarginal(fun=function(x) 1/x, mar=prec.post2)
plot(var.post,type="l",xlab=expression(sigma^2))

As before, the function inla.emarginal computes the expected values of a given transformation.

# Compute the posterior mean of the variance, E(1/precision | y)
inla.emarginal(fun=function(x) 1/x, marg=prec.post2)
## [1] 0.03558442

3.6 Explore the random effect \(v\)

The objects summary.random and marginals.random contain the posterior summaries and marginals of the random effect \(v_i\), \(i=1,...,33\).

# The summary statistics for the random effects (one row per area)
names(output2$summary.random)
## [1] "id"
head(output2$summary.random[["id"]])
##   ID         mean        sd  0.025quant     0.5quant 0.975quant         mode          kld
## 1  1  0.089406950 0.1855924 -0.26570838  0.084874158  0.4706426  0.076278410 2.231327e-05
## 2  2 -0.080122252 0.1270878 -0.33723923 -0.077718602  0.1634663 -0.073088084 1.658365e-06
## 3  3 -0.002077095 0.1061223 -0.21383939 -0.001066596  0.2038143  0.000857712 2.974702e-07
## 4  4  0.182890373 0.1196775 -0.05034696  0.182101174  0.4200143  0.180297779 3.473602e-06
## 5  5  0.008564004 0.1072599 -0.20545703  0.009578714  0.2166711  0.011495853 2.496290e-07
## 6  6  0.169336487 0.1133045 -0.05178930  0.168687814  0.3936510  0.167247338 2.417594e-06
dim(output2$summary.random[[1]]) #nrow = n. of areas
## [1] 33  8
# The posterior marginals for the random effects: a list of 33 marginals
class(output2$marginals.random)
## [1] "list"
names(output2$marginals.random)
## [1] "id"
class(output2$marginals.random[["id"]])
## [1] "list"
length(output2$marginals.random[["id"]])
## [1] 33

Assume that we want to compute for the FIRST area the posterior mean of the exponentiated random effect:

# Posterior mean of exp(v_1) for the first area (City of London)
inla.emarginal(fun=exp, marg=output2$marginals.random[["id"]][[1]])
## [1] 1.11272

We now want to repeat the previous computation for all the areas in order to produce a map of the relative risk of suicide relative to average across London. We will use lapply to apply the same function inla.emarginal to all the elements in a list:

# Apply the same posterior-mean computation to every area's marginal
exp.v = lapply(output2$marginals.random[["id"]],
                         function(x) inla.emarginal(exp,x))
head(exp.v) #it's a list
## $index.1
## [1] 1.11272
## 
## $index.2
## [1] 0.9304351
## 
## $index.3
## [1] 1.003539
## 
## $index.4
## [1] 1.209326
## 
## $index.5
## [1] 1.014398
## 
## $index.6
## [1] 1.192157
LondonSuicides$exp.v = unlist(exp.v) #it's a vector

We prepare everything for the map, i.e. we update the shapefile data table with the information contained in the LondonSuicides dataframe:

head(LondonSuicides)
##    GSS_CODE                 NAME  y    E    x1       SMR id     exp.v
## 1 E09000001       City of London  4  1.5 12.84 2.6666667  1 1.1127203
## 2 E09000002 Barking and Dagenham 34 37.5 34.49 0.9066667  2 0.9304351
## 3 E09000003               Barnet 68 71.6 21.16 0.9497207  3 1.0035393
## 4 E09000004               Bexley 57 46.6 16.21 1.2231760  4 1.2093261
## 5 E09000005                Brent 64 62.5 29.22 1.0240000  5 1.0143975
## 6 E09000006              Bromley 72 62.2 14.36 1.1575563  6 1.1921573
head(london.shp@data)
##                   NAME  GSS_CODE  HECTARES NONLD_AREA ONS_INNER SUB_2009 SUB_2006
## 0 Kingston upon Thames E09000021  3726.117      0.000         F     <NA>     <NA>
## 1              Croydon E09000008  8649.441      0.000         F     <NA>     <NA>
## 2              Bromley E09000006 15013.487      0.000         F     <NA>     <NA>
## 3             Hounslow E09000018  5658.541     60.755         F     <NA>     <NA>
## 4               Ealing E09000009  5554.428      0.000         F     <NA>     <NA>
## 5             Havering E09000016 11445.735    210.763         F     <NA>     <NA>
# Join the model results onto the shapefile data table by borough code;
# duplicated NAME columns get the .x/.y suffixes
# NOTE(review): merge() sorts by GSS_CODE by default, which can break the
# row-to-polygon correspondence of a SpatialPolygonsDataFrame (the output
# order below differs from the original Kingston-first order) — verify the
# rows still line up with the polygons, e.g. via match() on GSS_CODE
london.shp@data = merge(london.shp@data,LondonSuicides, by="GSS_CODE")
head(london.shp@data)
##    GSS_CODE               NAME.x  HECTARES NONLD_AREA ONS_INNER SUB_2009 SUB_2006
## 1 E09000001       City of London   314.942     24.546         T     <NA>     <NA>
## 2 E09000002 Barking and Dagenham  3779.934    169.150         F     <NA>     <NA>
## 3 E09000003               Barnet  8674.837      0.000         F     <NA>     <NA>
## 4 E09000004               Bexley  6428.649    370.619         F     <NA>     <NA>
## 5 E09000005                Brent  4323.270      0.000         F     <NA>     <NA>
## 6 E09000006              Bromley 15013.487      0.000         F     <NA>     <NA>
##                 NAME.y  y    E    x1       SMR id     exp.v
## 1       City of London  4  1.5 12.84 2.6666667  1 1.1127203
## 2 Barking and Dagenham 34 37.5 34.49 0.9066667  2 0.9304351
## 3               Barnet 68 71.6 21.16 0.9497207  3 1.0035393
## 4               Bexley 57 46.6 16.21 1.2231760  4 1.2093261
## 5                Brent 64 62.5 29.22 1.0240000  5 1.0143975
## 6              Bromley 72 62.2 14.36 1.1575563  6 1.1921573
# Mapping!
spplot(obj=london.shp, zcol=c("SMR", "exp.v"), at=c(0.6,0.8,0.95,1.05,1.2,3),col.regions=terrain.colors(5))

spplot(obj=london.shp, zcol="SMR", main="SMR", at=c(0.6,0.8,0.95,1.05,1.2,3),col.regions=terrain.colors(5))

spplot(obj=london.shp, zcol="exp.v", main="exp.v", at=c(0.6,0.8,0.95,1.05,1.2,3),col.regions=terrain.colors(5))

The following plot aims to show that, with the hierarchical model, the estimated RRs (exp.v) are shrunk towards the mean (less variability than the SMRs). We compare the SMRs with the estimated RRs.

# Shrinkage plot: each segment joins an area's SMR (left, x = 1) to its
# estimated RR (right, x = n); both sides share the same vertical scale
op = par(mar = c(5,4,4,4) + 0.1)
plot(LondonSuicides$SMR,type="n",ylab="SMR",xaxt="n",xlab="")
for(i in 1:nrow(LondonSuicides)){
  segments(1,LondonSuicides$SMR[i],
           nrow(LondonSuicides),LondonSuicides$exp.v[i])
}
# Mirror the axis on the right-hand side for the RR labels
axis(side = 4)
mtext("estimated RR", side = 4, line = 3, cex = par("cex.lab"))

3.7 Compute the map of excess risk

We are now interested in mapping the excess risk defined as \(p(\exp(v)>1\mid \mathbf y)\) or equivalently as \(p(v>0\mid \mathbf y)\). We will use inla.pmarginal (distribution function) inside lapply to compute the required probability:

# Excess-risk probability per area: p(v_i > 0 | y) = 1 - F(0)
threshold = 0
prob.v = lapply(output2$marginals.random[["id"]],
                          function(x) 1-inla.pmarginal(threshold,x))
range(unlist(prob.v))
## [1] 0.02353732 0.97369162
# Add the probability vector to the dataframe
LondonSuicides$prob.v = unlist(prob.v)
head(LondonSuicides)
##    GSS_CODE                 NAME  y    E    x1       SMR id     exp.v    prob.v
## 1 E09000001       City of London  4  1.5 12.84 2.6666667  1 1.1127203 0.6833279
## 2 E09000002 Barking and Dagenham 34 37.5 34.49 0.9066667  2 0.9304351 0.2642949
## 3 E09000003               Barnet 68 71.6 21.16 0.9497207  3 1.0035393 0.4945853
## 4 E09000004               Bexley 57 46.6 16.21 1.2231760  4 1.2093261 0.9377448
## 5 E09000005                Brent 64 62.5 29.22 1.0240000  5 1.0143975 0.5344875
## 6 E09000006              Bromley 72 62.2 14.36 1.1575563  6 1.1921573 0.9333485
# Update the shapefile data table only with the new column
# NOTE(review): even with sort=FALSE, confirm the merged rows still match
# the polygon order of the SpatialPolygonsDataFrame
london.shp@data = merge(london.shp@data,
                        LondonSuicides[,c("GSS_CODE","prob.v")],
                        by="GSS_CODE", sort=FALSE)
head(london.shp@data)
##    GSS_CODE               NAME.x  HECTARES NONLD_AREA ONS_INNER SUB_2009 SUB_2006
## 1 E09000001       City of London   314.942     24.546         T     <NA>     <NA>
## 2 E09000002 Barking and Dagenham  3779.934    169.150         F     <NA>     <NA>
## 3 E09000003               Barnet  8674.837      0.000         F     <NA>     <NA>
## 4 E09000004               Bexley  6428.649    370.619         F     <NA>     <NA>
## 5 E09000005                Brent  4323.270      0.000         F     <NA>     <NA>
## 6 E09000006              Bromley 15013.487      0.000         F     <NA>     <NA>
##                 NAME.y  y    E    x1       SMR id     exp.v    prob.v
## 1       City of London  4  1.5 12.84 2.6666667  1 1.1127203 0.6833279
## 2 Barking and Dagenham 34 37.5 34.49 0.9066667  2 0.9304351 0.2642949
## 3               Barnet 68 71.6 21.16 0.9497207  3 1.0035393 0.4945853
## 4               Bexley 57 46.6 16.21 1.2231760  4 1.2093261 0.9377448
## 5                Brent 64 62.5 29.22 1.0240000  5 1.0143975 0.5344875
## 6              Bromley 72 62.2 14.36 1.1575563  6 1.1921573 0.9333485
# Mapping!
spplot(obj=london.shp, zcol="prob.v",main="", at=c(0,0.2,0.8,1),col.regions=c("green","white","red"))

4 Poisson-logNormal model with covariate and spatial structure

We implement the following BYM model (see slide 41 of the lecture slides): \[ y_i\sim \mbox{Poisson}(\phi_i E_i) \;\;\; i=1,...,33 \] where \(\phi_i\) is the relative risk and \(E_i\) the expected number of cases.

The linear predictor is defined on the logarithm scale and is given by \[ \eta_i=\log(\phi_i)=\beta_0 + \beta_1 x_{1i}+ v_{i} + u_i \] where \(\beta_0\) is the intercept, \(x_{1i}\) is the value of the covariate \(x_1\) for the i-th area, \(\beta_1\) is the covariate coefficient. \(v_i\) represents the random effect which is assumed to be normally distributed with mean equal to zero and precision \(\tau_v\).

For the spatially structured random effect we consider the following prior distribution \(\mathbf u \sim ICAR(\mathbf W, 1/\tau_u)\) where \(\mathbf W\) is the neighbourhood matrix.

For both the precision hyperparameters \(\tau_v\) (prec.unstruct) and \(\tau_u\) (prec.spatial) we specify a logGamma(1,0.1) prior distribution (see inla.models()$latent$bym$hyper for the default priors).

The new R-INLA formula will use the bym model (see inla.doc("bym")). The neighbourhood structure is passed through the graph argument.

# BYM model: iid effect v_i plus ICAR spatial effect u_i, with the
# adjacency passed via graph= and logGamma(1, 0.1) priors on both
# log-precisions
formula = y ~ 1 + x1 +
  f(id, model="bym",graph=london.adj.path,
          hyper=list(prec.unstruct=list(prior="loggamma",param=c(1,0.1)), 
                     prec.spatial=list(prior="loggamma",param=c(1,0.1))))

output3 = inla(formula,
                family = "poisson",
                data = LondonSuicides,
                E = E,
                control.compute=list(dic=TRUE))

We explore the random effects posterior summaries:

# For a bym model summary.random$id has 2n rows:
# rows 1..n are z_i = u_i + v_i, rows n+1..2n are the spatial part u_i only
names(head(output3$summary.random))
## [1] "id"
dim(output3$summary.random$id)
## [1] 66  8
head(output3$summary.random$id)
##   ID         mean        sd  0.025quant     0.5quant 0.975quant         mode          kld
## 1  1  0.125643369 0.2206527 -0.29835206  0.121413846  0.5740109  0.113054158 5.761496e-05
## 2  2 -0.101818668 0.1361667 -0.37638239 -0.099483010  0.1595055 -0.094957220 6.023781e-06
## 3  3 -0.006548265 0.1100048 -0.22611863 -0.005419169  0.2065168 -0.003230102 1.455352e-06
## 4  4  0.221726172 0.1256588 -0.02657863  0.222129691  0.4674728  0.222834955 6.980476e-06
## 5  5  0.019735905 0.1116661 -0.20395003  0.021185836  0.2351433  0.024010995 1.408557e-06
## 6  6  0.188635268 0.1225544 -0.05314327  0.188904339  0.4286531  0.189386989 5.360786e-06
tail(output3$summary.random$id)
##    ID          mean         sd 0.025quant      0.5quant 0.975quant          mode          kld
## 61 61  0.0090271220 0.09222179 -0.1714838  0.0077255555  0.1970771  0.0054533071 3.151121e-05
## 62 62  0.0135219212 0.09497554 -0.1711367  0.0117527640  0.2085073  0.0086591464 3.640570e-05
## 63 63  0.0105496325 0.09476327 -0.1750568  0.0093154276  0.2032653  0.0071328843 3.162866e-05
## 64 64 -0.0044245215 0.09395840 -0.1918650 -0.0044414376  0.1828403 -0.0044646501 2.595046e-05
## 65 65  0.0003860835 0.10463273 -0.2072044  0.0001304076  0.2090899 -0.0003735834 2.094006e-05
## 66 66  0.0038451840 0.08682311 -0.1662534  0.0026227609  0.1812640  0.0006648265 3.958505e-05

The latter is a dataframe formed by \(2n\) rows: the first \(n\) rows include information on the area specific residuals \(z_i=u_i+v_i\), which are the primary interest in a disease mapping study, while the remaining present information on the spatially structured residual \(u_i\) only. Recall that all these parameters are on the logarithmic scale; for the sake of interpretability it would be more convenient to transform them back to the natural scale.

The computation of the posterior mean for the random effects \(z_i=v_i+u_i\) is performed using lapply as shown before:

# Posterior mean of exp(z_i) using only the first n marginals (z_i = u_i + v_i)
LondonSuicides$exp.z = unlist(lapply(output3$marginals.random$id[1:nrow(LondonSuicides)],
                                     function(x) inla.emarginal(exp,x)))

We also compute the probability \(p(\exp(z_i)>1\mid \mathbf y)\) (or equivalently \(p(u_i+v_i>0\mid \mathbf y)\), which is easier to obtain) using the built-in function inla.pmarginal:

# Excess-risk probability p(z_i > 0 | y); threshold (= 0) is defined earlier
LondonSuicides$prob.z = unlist(lapply(output3$marginals.random$id[1:nrow(LondonSuicides)],
                               function(x) 1-inla.pmarginal(threshold,x)))
head(LondonSuicides)
##    GSS_CODE                 NAME  y    E    x1       SMR id     exp.v    prob.v     exp.z    prob.z
## 1 E09000001       City of London  4  1.5 12.84 2.6666667  1 1.1127203 0.6833279 1.1620877 0.7146642
## 2 E09000002 Barking and Dagenham 34 37.5 34.49 0.9066667  2 0.9304351 0.2642949 0.9115456 0.2268014
## 3 E09000003               Barnet 68 71.6 21.16 0.9497207  3 1.0035393 0.4945853 0.9994768 0.4788570
## 4 E09000004               Bexley 57 46.6 16.21 1.2231760  4 1.2093261 0.9377448 1.2581052 0.9603001
## 5 E09000005                Brent 64 62.5 29.22 1.0240000  5 1.0143975 0.5344875 1.0262759 0.5741258
## 6 E09000006              Bromley 72 62.2 14.36 1.1575563  6 1.1921573 0.9333485 1.2166914 0.9375520
# Update the shapefile data table 
# NOTE(review): as before, confirm the merged rows still match the polygon order
london.shp@data = merge(london.shp@data,
                        LondonSuicides[,c("GSS_CODE","exp.z","prob.z")],
                        by="GSS_CODE", sort=FALSE)
head(london.shp@data)
##    GSS_CODE               NAME.x  HECTARES NONLD_AREA ONS_INNER SUB_2009 SUB_2006
## 1 E09000001       City of London   314.942     24.546         T     <NA>     <NA>
## 2 E09000002 Barking and Dagenham  3779.934    169.150         F     <NA>     <NA>
## 3 E09000003               Barnet  8674.837      0.000         F     <NA>     <NA>
## 4 E09000004               Bexley  6428.649    370.619         F     <NA>     <NA>
## 5 E09000005                Brent  4323.270      0.000         F     <NA>     <NA>
## 6 E09000006              Bromley 15013.487      0.000         F     <NA>     <NA>
##                 NAME.y  y    E    x1       SMR id     exp.v    prob.v     exp.z    prob.z
## 1       City of London  4  1.5 12.84 2.6666667  1 1.1127203 0.6833279 1.1620877 0.7146642
## 2 Barking and Dagenham 34 37.5 34.49 0.9066667  2 0.9304351 0.2642949 0.9115456 0.2268014
## 3               Barnet 68 71.6 21.16 0.9497207  3 1.0035393 0.4945853 0.9994768 0.4788570
## 4               Bexley 57 46.6 16.21 1.2231760  4 1.2093261 0.9377448 1.2581052 0.9603001
## 5                Brent 64 62.5 29.22 1.0240000  5 1.0143975 0.5344875 1.0262759 0.5741258
## 6              Bromley 72 62.2 14.36 1.1575563  6 1.1921573 0.9333485 1.2166914 0.9375520
# Mapping!
spplot(obj=london.shp, zcol="exp.z", at=c(0.6,0.8,0.95,1.05,1.2,3),col.regions=terrain.colors(5))

spplot(obj=london.shp, zcol="prob.z",main="",at=c(0,0.2,0.8,1),col.regions=c("green","white","red"))

5 Model comparison

Let’s compare the two models (the one with \(v\) only and the bym model) by DIC.

# Lower DIC is preferred; here the iid model is marginally better
output2$dic$dic #model with covariate + iid
## [1] 241.0645
output3$dic$dic #model with covariate + bym
## [1] 242.2113